{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "87677c2a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 词汇表： {'讨厌': 0, '我': 1, '喜欢': 2, '挨打': 3, '爸爸': 4, '玩具': 5, '爱': 6}\n",
      " 词汇表大小： 7\n"
     ]
    }
   ],
   "source": [
    "# 构建一个非常简单的数据集\n",
    "sentences = [\"我 喜欢 玩具\", \"我 爱 爸爸\", \"我 讨厌 挨打\"] \n",
    "# 将所有句子连接在一起，用空格分隔成多个词，再将重复的词去除，构建词汇表\n",
    "word_list = list(set(\" \".join(sentences).split())) \n",
    "# 创建一个字典，将每个词映射到一个唯一的索引\n",
    "word_to_idx = {word: idx for idx, word in enumerate(word_list)} \n",
    "# 创建一个字典，将每个索引映射到对应的词\n",
    "idx_to_word = {idx: word for idx, word in enumerate(word_list)} \n",
    "voc_size = len(word_list) # 计算词汇表的大小\n",
    "print(' 词汇表：', word_to_idx) # 打印词汇到索引的映射字典\n",
    "print(' 词汇表大小：', voc_size) # 打印词汇表大小"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0fb285df",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 输入批处理数据： tensor([[1, 2],\n",
      "        [1, 0]])\n",
      " 输入批处理数据对应的原始词： [['我', '喜欢'], ['我', '讨厌']]\n",
      " 目标批处理数据： tensor([5, 3])\n",
      " 目标批处理数据对应的原始词： ['玩具', '挨打']\n"
     ]
    }
   ],
   "source": [
    "# 构建批处理数据\n",
    "import torch # 导入 PyTorch 库\n",
    "import random # 导入 random 库\n",
    "batch_size = 2 # 每批数据的大小\n",
    "def make_batch():\n",
    "    input_batch = []  # 定义输入批处理列表\n",
    "    target_batch = []  # 定义目标批处理列表\n",
    "    selected_sentences = random.sample(sentences, batch_size) # 随机选择句子\n",
    "    for sen in selected_sentences:  # 遍历每个句子\n",
    "        word = sen.split()  # 用空格将句子分隔成多个词\n",
    "        # 将除最后一个词以外的所有词的索引作为输入\n",
    "        input = [word_to_idx[n] for n in word[:-1]]  # 创建输入数据\n",
    "        # 将最后一个词的索引作为目标\n",
    "        target = word_to_idx[word[-1]]  # 创建目标数据\n",
    "        input_batch.append(input)  # 将输入添加到输入批处理列表\n",
    "        target_batch.append(target)  # 将目标添加到目标批处理列表\n",
    "    input_batch = torch.LongTensor(input_batch) # 将输入数据转换为张量\n",
    "    target_batch = torch.LongTensor(target_batch) # 将目标数据转换为张量\n",
    "    return input_batch, target_batch  # 返回输入批处理和目标批处理数据\n",
    "input_batch, target_batch = make_batch() # 生成批处理数据\n",
    "print(\" 输入批处理数据：\",input_batch)  # 打印输入批处理数据\n",
    "# 将输入批处理数据中的每个索引值转换为对应的原始词\n",
    "input_words = []\n",
    "for input_idx in input_batch:\n",
    "    input_words.append([idx_to_word[idx.item()] for idx in input_idx])\n",
    "print(\" 输入批处理数据对应的原始词：\",input_words)\n",
    "print(\" 目标批处理数据：\",target_batch) # 打印目标批处理数据\n",
    "# 将目标批处理数据中的每个索引值转换为对应的原始词\n",
    "target_words = [idx_to_word[idx.item()] for idx in target_batch]\n",
    "print(\" 目标批处理数据对应的原始词：\",target_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "87f165e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch.nn as nn # 导入神经网络模块\n",
    "# 定义神经概率语言模型（NPLM）\n",
    "class NPLM(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(NPLM, self).__init__() # 调用父类的构造函数\n",
    "        self.C = nn.Embedding(voc_size, embedding_size) # 定义一个词嵌入层\n",
    "        # 用 LSTM 层替代第一个线性层，其输入大小为 embedding_size，隐藏层大小为 n_hidden\n",
    "        self.lstm = nn.LSTM(embedding_size, n_hidden, batch_first=True) \n",
    "        # 第二个线性层，其输入大小为 n_hidden，输出大小为 voc_size，即词汇表大小\n",
    "        self.linear = nn.Linear(n_hidden, voc_size) \n",
    "    def forward(self, X):  # 定义前向传播过程\n",
    "        # 输入数据 X 张量的形状为 [batch_size, n_step]\n",
    "        X = self.C(X)  # 将 X 通过词嵌入层，形状变为 [batch_size, n_step, embedding_size]\n",
    "        # 通过 LSTM 层\n",
    "        lstm_out, _ = self.lstm(X) # lstm_out 形状变为 [batch_size, n_step, n_hidden]\n",
    "        # 只选择最后一个时间步的输出作为全连接层的输入，通过第二个线性层得到输出 \n",
    "        output = self.linear(lstm_out[:, -1, :]) # output 的形状为 [batch_size, voc_size]\n",
    "        return output # 返回输出结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9d214ced",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " NPLM 模型结构： NPLM(\n",
      "  (C): Embedding(7, 2)\n",
      "  (lstm): LSTM(2, 2, batch_first=True)\n",
      "  (linear): Linear(in_features=2, out_features=7, bias=True)\n",
      ")\n"
     ]
    }
   ],
   "source": [
    "n_step = 2 # 时间步数，表示每个输入序列的长度，也就是上下文长度 \n",
    "n_hidden = 2 # 隐藏层大小\n",
    "embedding_size = 2 # 词嵌入大小\n",
    "model = NPLM() # 创建神经概率语言模型实例\n",
    "print(' NPLM 模型结构：', model) # 打印模型的结构"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f3c3dfb5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch: 1000 cost = 0.000698\n",
      "Epoch: 2000 cost = 0.000196\n",
      "Epoch: 3000 cost = 0.000068\n",
      "Epoch: 4000 cost = 0.000045\n",
      "Epoch: 5000 cost = 0.000028\n"
     ]
    }
   ],
   "source": [
    "import torch.optim as optim # 导入优化器模块\n",
    "criterion = nn.CrossEntropyLoss() # 定义损失函数为交叉熵损失\n",
    "optimizer = optim.Adam(model.parameters(), lr=0.1) # 定义优化器为 Adam，学习率为 0.1\n",
    "# 训练模型\n",
    "for epoch in range(5000): # 设置训练迭代次数\n",
    "   optimizer.zero_grad() # 清除优化器的梯度\n",
    "   input_batch, target_batch = make_batch() # 创建输入和目标批处理数据\n",
    "   output = model(input_batch) # 将输入数据传入模型，得到输出结果\n",
    "   loss = criterion(output, target_batch) # 计算损失值\n",
    "   if (epoch + 1) % 1000 == 0: # 每 1000 次迭代，打印损失值\n",
    "     print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n",
    "   loss.backward() # 反向传播计算梯度\n",
    "   optimizer.step() # 更新模型参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fcfa4b9b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['我', '讨厌'] -> 挨打\n",
      "['我', '喜欢'] -> 玩具\n"
     ]
    }
   ],
   "source": [
    "# 进行预测\n",
    "input_strs = [['我', '讨厌'], ['我', '喜欢']]  # 需要预测的输入序列\n",
    "# 将输入序列转换为对应的索引\n",
    "input_indices = [[word_to_idx[word] for word in seq] for seq in input_strs]\n",
    "# 将输入序列的索引转换为张量\n",
    "input_batch = torch.LongTensor(input_indices) \n",
    "# 对输入序列进行预测，取输出中概率最大的类别\n",
    "predict = model(input_batch).data.max(1)[1]  \n",
    "# 将预测结果的索引转换为对应的词\n",
    "predict_strs = [idx_to_word[n.item()] for n in predict.squeeze()]  \n",
    "for input_seq, pred in zip(input_strs, predict_strs):\n",
    "   print(input_seq, '->', pred)  # 打印输入序列和预测结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7660094b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
